from __future__ import print_function, division
import sys, os, time, pickle, mkl
#limit the number of threds numpy/scipy are using
nthreads = 15
mkl.set_num_threads(nthreads)
os.environ["OMP_NUM_THREADS"] = str(nthreads)
import numpy as np
import multiprocessing as mp
from sklearn.datasets.samples_generator import make_blobs
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.rcParams['figure.titlesize'] = 30
mpl.rcParams['axes.titlesize'] = 30
mpl.rcParams['axes.labelsize'] = 20
mpl.rcParams['xtick.labelsize'] = 20
mpl.rcParams['ytick.labelsize'] = 20
mpl.rcParams['legend.fontsize'] = 20
mpl.rc('text', usetex=True)
# Custom pyscripts
maindir = '/home/lanhuong/Projects/ManifoldLearning/DiffusionTSNE'
#maindir = '/Users/lanhuong/MEGA/BIOSTATS GROUP/Projects/ManifoldLearning/DiffusionTSNE'
#maindir = '/home/lanhuong/MEGA/BIOSTATS GROUP/Projects/ManifoldLearning/DiffusionTSNE'
os.chdir(maindir)
from diffusion_tsne import diffusion_tsne
from plotting import *
from generate_data import *
from utils import *
from metrics import *
%load_ext autoreload
%autoreload 2
MACHINE_EPSILON = np.finfo(np.double).eps
def bw(betas):
    """Convert Gaussian precision parameters into bandwidths.

    Computes ``sqrt(1 / (2 * betas))`` element-wise.

    Parameters
    ----------
    betas : scalar or array-like
        Precision value(s).  Plain Python sequences are accepted as well as
        numpy arrays; they are converted with ``np.asarray`` first so that
        ``2 * betas`` is an arithmetic product rather than list repetition.

    Returns
    -------
    numpy scalar or ndarray
        The bandwidth(s), with the same shape as ``betas``.
    """
    betas = np.asarray(betas)
    return np.sqrt(1.0 / (2 * betas))
# ---------------------------------------------------------------------------
# Experiment: two Gaussian blobs with equal size (2500 each) and equal
# standard deviation (1).
# ---------------------------------------------------------------------------
def _opaque_legend(markerscale=4):
    """Add a legend to the current axes and make its markers fully opaque.

    The scatter plots are drawn with alpha < 1, which would also fade the
    legend markers.  ``Legend.legendHandles`` was renamed to
    ``legend_handles`` in Matplotlib 3.7 (the old name was removed in 3.9),
    so the new attribute is tried first and the old one is the fallback.

    Returns the created ``Legend`` object.
    """
    legend = plt.legend(markerscale=markerscale)
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_alpha(1)
    return legend


clst_means = [[0, 0], [15, 15]]
clst_std = [1, 1]
clst_n_samples = [2500, 2500]
X, color = make_blobs(
    n_samples=clst_n_samples, n_features=len(clst_means[0]),
    centers=clst_means, cluster_std=clst_std, random_state=0)
# Reorder the samples so that points of the same cluster are contiguous.
idx = np.argsort(color)
X = X[idx, :]
color = color[idx]
# Replace the integer cluster ids by the string labels "C1"/"C2".
color = np.array([["C1", "C2"][i] for i in color])
X.shape
plot2D(X, label=color, s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Presumably "save only" makes diffusion_tsne write its affinity diagnostics
# (betas.dat is read right below) without being needed for an embedding --
# TODO confirm against diffusion_tsne.
Y_fitsne = diffusion_tsne(
    X, perplexity=300, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save only")
betas = np.fromfile('betas.dat', dtype=np.dtype('d'))
# 1/(2*betas) is the square of bw(betas).
plot2D(X, label=1/(2*betas), s=15, figsize=(8, 7))
plt.axis('equal')

# Reference embedding with scikit-learn's t-SNE.
# NOTE(review): no random_state is passed, so this run is not seeded.
from sklearn.manifold import TSNE
perp = 300
start = time.time()
Y = TSNE(n_components=2, perplexity=perp).fit_transform(X)
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y, label=color, s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Fixed-bandwidth variant (perplexity=-1, sigma=2, K=perp neighbours).
start = time.time()
Y_fixed = diffusion_tsne(X, perplexity=-1, sigma=2, K=perp,
                         seed=123, nthreads=nthreads,
                         knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('Fixed bandwidth t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fixed, label=color, s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Perplexity-based run without probability scaling; the diagnostic files
# read afterwards are taken from this run.
start = time.time()
Y_fitsne, loss = diffusion_tsne(
    X, perplexity=perp, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save", return_loss=True,
    knn_algo='vp-tree', nbody_algo="Barnes-Hut")
end = time.time()
betas = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne, label=color, s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Same data with scale_probs=True and a longer optimisation.
start = time.time()
Y_fitsne_scale, loss = diffusion_tsne(
    X, perplexity=perp, seed=123, nthreads=nthreads,
    scale_probs=True, load_affinities="save", return_loss=True,
    max_iter=5000, early_exag_coeff=12, learning_rate=50,
    knn_algo='vp-tree', nbody_algo="Barnes-Hut")
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne_scale, label=color, s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Colour the scaled embedding by the per-point diagnostics loaded above
# (they were read after the unscaled run, not re-read after the scaled one).
plot2D(Y_fitsne_scale, label=betas, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale, label=degrees, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale, label=meandist, s=25, figsize=(8, 7))
plt.axis('equal')
# ---------------------------------------------------------------------------
# Experiment: two Gaussian blobs with different sizes (1000 vs 4000) and
# different standard deviations (1 vs 2).
# `perp` and `nthreads` are defined earlier in the file.
# ---------------------------------------------------------------------------
def _opaque_legend(markerscale=4):
    """Add a legend to the current axes and make its markers fully opaque.

    The scatter plots are drawn with alpha < 1, which would also fade the
    legend markers.  ``Legend.legendHandles`` was renamed to
    ``legend_handles`` in Matplotlib 3.7 (the old name was removed in 3.9),
    so the new attribute is tried first and the old one is the fallback.

    Returns the created ``Legend`` object.
    """
    legend = plt.legend(markerscale=markerscale)
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_alpha(1)
    return legend


clst_means = [[0, 0], [15, 15]]
clst_std = [1, 2]
clst_n_samples = [1000, 4*1000]
X1, color1 = make_blobs(
    n_samples=clst_n_samples, n_features=len(clst_means[0]),
    centers=clst_means, cluster_std=clst_std, random_state=0)
# Reorder the samples so that points of the same cluster are contiguous.
idx = np.argsort(color1)
X1 = X1[idx, :]
color1 = color1[idx]
# Replace the integer cluster ids by the string labels "C1"/"C2".
color1 = np.array([["C1", "C2"][i] for i in color1])
plot2D(X1, label=color1, col_map=["forestgreen", "rebeccapurple"],
       s=15, figsize=(7, 7), alpha=0.5)
leg = _opaque_legend()

# Presumably "save only" makes diffusion_tsne write its affinity diagnostics
# (betas.dat is read right below) -- TODO confirm against diffusion_tsne.
Z = diffusion_tsne(
    X1, perplexity=300, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save only")
betas1 = np.fromfile('betas.dat', dtype=np.dtype('d'))
# 1/(2*betas1) is the square of bw(betas1).
plot2D(X1, label=1/(2*betas1), s=15, figsize=(8, 7))
plt.axis('equal')

# Reference embedding with scikit-learn's t-SNE.
# NOTE(review): no random_state is passed, so this run is not seeded.
from sklearn.manifold import TSNE
start = time.time()
Y1 = TSNE(n_components=2, perplexity=perp).fit_transform(X1)
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y1, label=color1, s=15, figsize=(7, 7),
       col_map=["forestgreen", "rebeccapurple"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Fixed-bandwidth variant (perplexity=-1, sigma=2, K=perp neighbours).
start = time.time()
Y_fixed1 = diffusion_tsne(X1, perplexity=-1, sigma=2, K=perp,
                          seed=123, nthreads=nthreads,
                          knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('Fixed bandwidth t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fixed1, label=color1, s=15, figsize=(7, 7),
       col_map=["forestgreen", "rebeccapurple"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Perplexity-based run without probability scaling; the diagnostic files
# read afterwards are taken from this run.
start = time.time()
Y_fitsne1 = diffusion_tsne(
    X1, perplexity=perp, seed=3465, nthreads=nthreads,
    scale_probs=False, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
betas1 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees1 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist1 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne1, label=color1, col_map=["forestgreen", "rebeccapurple"],
       s=15, figsize=(7, 7), alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Same data with scale_probs=True and a longer optimisation.
start = time.time()
Y_fitsne_scale1 = diffusion_tsne(
    X1, perplexity=perp, seed=756, nthreads=nthreads,
    max_iter=5000, early_exag_coeff=12, learning_rate=200,
    scale_probs=True, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne_scale1, label=color1, s=15, figsize=(7, 7),
       col_map=["forestgreen", "rebeccapurple"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Colour the scaled embedding (and the input data) by the per-point
# diagnostics loaded above; they were read after the unscaled run.
plot2D(Y_fitsne_scale1, label=betas1, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale1, label=degrees1, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(X1, label=betas1, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale1, label=meandist1, s=25, figsize=(8, 7))
plt.axis('equal')
# Different number of observations, same variance
# ---------------------------------------------------------------------------
# Experiment: two Gaussian blobs with different sizes (4000 vs 1000) and the
# same standard deviation (1).
# `perp` and `nthreads` are defined earlier in the file.
# ---------------------------------------------------------------------------
def _opaque_legend(markerscale=4):
    """Add a legend to the current axes and make its markers fully opaque.

    The scatter plots are drawn with alpha < 1, which would also fade the
    legend markers.  ``Legend.legendHandles`` was renamed to
    ``legend_handles`` in Matplotlib 3.7 (the old name was removed in 3.9),
    so the new attribute is tried first and the old one is the fallback.

    Returns the created ``Legend`` object.
    """
    legend = plt.legend(markerscale=markerscale)
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_alpha(1)
    return legend


clst_means = [[0, 0], [15, 15]]
clst_std = [1, 1]
clst_n_samples = [4000, 1000]
X2, color2 = make_blobs(
    n_samples=clst_n_samples, n_features=len(clst_means[0]),
    centers=clst_means, cluster_std=clst_std, random_state=0)
# Reorder the samples so that points of the same cluster are contiguous.
idx = np.argsort(color2)
X2 = X2[idx, :]
color2 = color2[idx]
# Replace the integer cluster ids by the string labels "C1"/"C2".
color2 = np.array([["C1", "C2"][i] for i in color2])
plot2D(X2, label=color2, s=15, figsize=(7, 7),
       col_map=["darkblue", "orangered"], alpha=0.5)
leg = _opaque_legend()

# Presumably "save only" makes diffusion_tsne write its affinity diagnostics
# (betas.dat is read right below) -- TODO confirm against diffusion_tsne.
Z = diffusion_tsne(
    X2, perplexity=300, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save only")
betas2 = np.fromfile('betas.dat', dtype=np.dtype('d'))
# 1/(2*betas2) is the square of bw(betas2).
plot2D(X2, label=1/(2*betas2), s=15, figsize=(8, 7))
plt.axis('equal')

# Reference embedding with scikit-learn's t-SNE.
# NOTE(review): no random_state is passed, so this run is not seeded.
from sklearn.manifold import TSNE
start = time.time()
Y2 = TSNE(n_components=2, perplexity=perp).fit_transform(X2)
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y2, label=color2, s=15, figsize=(7, 7),
       col_map=["darkblue", "orangered"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Fixed-bandwidth variant (perplexity=-1, sigma=2, K=perp neighbours).
start = time.time()
Y_fixed2 = diffusion_tsne(
    X2, perplexity=-1, sigma=2, K=perp,
    seed=123, nthreads=nthreads,
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('Fixed bandwidth t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fixed2, label=color2, s=15, figsize=(7, 7),
       col_map=["darkblue", "orangered"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Perplexity-based run without probability scaling.
start = time.time()
Y_fitsne2 = diffusion_tsne(
    X2, perplexity=perp, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
betas2 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees2 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist2 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne2, label=color2, s=15, figsize=(7, 7),
       col_map=["darkblue", "orangered"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Same data with scale_probs=True, a longer optimisation, and the loss
# returned alongside the embedding.
start = time.time()
Y_fitsne_scale2, loss = diffusion_tsne(
    X2, perplexity=perp, seed=123, nthreads=nthreads,
    max_iter=5000, early_exag_coeff=12, learning_rate=500,
    scale_probs=True, load_affinities="save", return_loss=True,
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne_scale2, label=color2, s=15, figsize=(7, 7),
       col_map=["darkblue", "orangered"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Re-read the diagnostic files after the scaled run.
betas2 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees2 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist2 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
plot2D(X2, label=betas2, s=15, figsize=(8, 7))
plt.axis('equal')
# Loss value against its index in the returned sequence.
# NOTE(review): this draws on whatever figure is current; outside a notebook
# that is the plot created just above -- confirm whether a new figure is
# wanted here.
plt.scatter(range(len(loss)), loss)
plot2D(Y_fitsne_scale2, label=betas2, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale2, label=degrees2, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale2, label=meandist2, s=15, figsize=(8, 7))
plt.axis('equal')
# ---------------------------------------------------------------------------
# Experiment: two Gaussian blobs with equal size (2500 each) and different
# standard deviations (1 vs 2).
# `perp` and `nthreads` are defined earlier in the file.
# ---------------------------------------------------------------------------
def _opaque_legend(markerscale=4):
    """Add a legend to the current axes and make its markers fully opaque.

    The scatter plots are drawn with alpha < 1, which would also fade the
    legend markers.  ``Legend.legendHandles`` was renamed to
    ``legend_handles`` in Matplotlib 3.7 (the old name was removed in 3.9),
    so the new attribute is tried first and the old one is the fallback.

    Returns the created ``Legend`` object.
    """
    legend = plt.legend(markerscale=markerscale)
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_alpha(1)
    return legend


clst_means = [[0, 0], [15, 15]]
clst_std = [1, 2]
clst_n_samples = [2500, 2500]
X3, color3 = make_blobs(
    n_samples=clst_n_samples, n_features=len(clst_means[0]),
    centers=clst_means, cluster_std=clst_std, random_state=0)
# Reorder the samples so that points of the same cluster are contiguous.
idx = np.argsort(color3)
X3 = X3[idx, :]
color3 = color3[idx]
# Replace the integer cluster ids by the string labels "C1"/"C2".
color3 = np.array([["C1", "C2"][i] for i in color3])
plot2D(X3, label=color3, s=15, figsize=(7, 7),
       col_map=["salmon", "mediumaquamarine"], alpha=0.5)
leg = _opaque_legend()

# Presumably "save only" makes diffusion_tsne write its affinity diagnostics
# (betas.dat is read right below) -- TODO confirm against diffusion_tsne.
Z = diffusion_tsne(
    X3, perplexity=300, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save only")
betas3 = np.fromfile('betas.dat', dtype=np.dtype('d'))
# 1/(2*betas3) is the square of bw(betas3).
plot2D(X3, label=1/(2*betas3), s=15, figsize=(8, 7))
plt.axis('equal')

# Reference embedding with scikit-learn's t-SNE.
# NOTE(review): no random_state is passed, so this run is not seeded.
from sklearn.manifold import TSNE
start = time.time()
Y3 = TSNE(n_components=2, perplexity=perp).fit_transform(X3)
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y3, label=color3, s=15, figsize=(7, 7),
       col_map=["salmon", "mediumaquamarine"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Fixed-bandwidth variant (perplexity=-1, sigma=2, K=perp neighbours).
start = time.time()
Y_fixed3 = diffusion_tsne(
    X3, perplexity=-1, sigma=2, K=perp,
    seed=123, nthreads=nthreads,
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('Fixed bandwidth t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fixed3, label=color3, s=15, figsize=(7, 7),
       col_map=["salmon", "mediumaquamarine"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Perplexity-based run without probability scaling; the diagnostic files
# read afterwards are taken from this run.
start = time.time()
Y_fitsne3 = diffusion_tsne(
    X3, perplexity=perp, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
betas3 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees3 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist3 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
plot2D(Y_fitsne3, label=color3, s=15, figsize=(7, 7),
       col_map=["salmon", "mediumaquamarine"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Same data with scale_probs=True and a longer optimisation.
# NOTE(review): this run uses perplexity=500 while every other run in this
# experiment uses `perp`; the diagnostics plotted below were read after the
# `perp` run above -- confirm this mismatch is intended.
start = time.time()
Y_fitsne_scale3 = diffusion_tsne(
    X3, perplexity=500, seed=3587, nthreads=nthreads,
    max_iter=5000, early_exag_coeff=12, learning_rate=200,
    scale_probs=True, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne_scale3, label=color3, s=15, figsize=(7, 7),
       col_map=["salmon", "mediumaquamarine"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Colour the scaled embedding by the per-point diagnostics loaded above.
plot2D(Y_fitsne_scale3, label=betas3, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale3, label=degrees3, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale3, label=meandist3, s=15, figsize=(8, 7))
plt.axis('equal')
# ---------------------------------------------------------------------------
# Experiment: two Gaussian blobs with different sizes (3000 vs 1500) and
# different standard deviations (1 vs 3).
# `perp` and `nthreads` are defined earlier in the file.
# ---------------------------------------------------------------------------
def _opaque_legend(markerscale=4):
    """Add a legend to the current axes and make its markers fully opaque.

    The scatter plots are drawn with alpha < 1, which would also fade the
    legend markers.  ``Legend.legendHandles`` was renamed to
    ``legend_handles`` in Matplotlib 3.7 (the old name was removed in 3.9),
    so the new attribute is tried first and the old one is the fallback.

    Returns the created ``Legend`` object.
    """
    legend = plt.legend(markerscale=markerscale)
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_alpha(1)
    return legend


clst_means = [[0, 0], [15, 15]]
clst_std = [1, 3]
clst_n_samples = [3000, 1500]
X4, color4 = make_blobs(
    n_samples=clst_n_samples, n_features=len(clst_means[0]),
    centers=clst_means, cluster_std=clst_std, random_state=0)
# Reorder the samples so that points of the same cluster are contiguous.
idx = np.argsort(color4)
X4 = X4[idx, :]
color4 = color4[idx]
# Replace the integer cluster ids by the string labels "C1"/"C2".
color4 = np.array([["C1", "C2"][i] for i in color4])
plot2D(X4, label=color4, s=15, figsize=(7, 7),
       col_map=["indianred", "teal"], alpha=0.5)
# A single legend call is enough: a second plt.legend() on the same axes
# simply replaces the first one.
leg = _opaque_legend()

# Presumably "save only" makes diffusion_tsne write its affinity diagnostics
# (betas.dat is read right below) -- TODO confirm against diffusion_tsne.
Z = diffusion_tsne(
    X4, perplexity=300, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save only")
betas4 = np.fromfile('betas.dat', dtype=np.dtype('d'))
# 1/(2*betas4) is the square of bw(betas4).
plot2D(X4, label=1/(2*betas4), s=15, figsize=(8, 7))
plt.axis('equal')

# Reference embedding with scikit-learn's t-SNE.
# NOTE(review): no random_state is passed, so this run is not seeded.
from sklearn.manifold import TSNE
start = time.time()
Y4 = TSNE(n_components=2, perplexity=perp).fit_transform(X4)
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y4, label=color4, s=15, figsize=(7, 7),
       col_map=["indianred", "teal"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Fixed-bandwidth variant (perplexity=-1, sigma=2, K=perp neighbours).
start = time.time()
Y_fixed4 = diffusion_tsne(
    X4, perplexity=-1, sigma=2, K=perp,
    seed=435, nthreads=nthreads,
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('Fixed bandwidth t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fixed4, label=color4, s=15, figsize=(7, 7),
       col_map=["indianred", "teal"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Perplexity-based run without probability scaling.
start = time.time()
Y_fitsne4 = diffusion_tsne(
    X4, perplexity=perp, seed=123, nthreads=nthreads,
    scale_probs=False, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
betas4 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees4 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist4 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
print('t-SNE embedding in %f sec' %(end-start))
plot2D(Y_fitsne4, label=color4, s=15, figsize=(7, 7),
       col_map=["indianred", "teal"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()
plot2D(X4, label=betas4, s=15, figsize=(8, 7))
plt.axis('equal')

# Same data with scale_probs=True and a much longer, more aggressive
# optimisation (10000 iterations, exaggeration 20, learning rate 1500).
start = time.time()
Y_fitsne_scale4 = diffusion_tsne(
    X4, perplexity=perp, seed=42, nthreads=nthreads,
    max_iter=10000, early_exag_coeff=20, learning_rate=1500,
    scale_probs=True, load_affinities="save",
    knn_algo='vp-tree', nbody_algo='Barnes-Hut')
end = time.time()
print('t-SNE embedding in %f sec' %(end-start))
# Re-read the diagnostic files after the scaled run.
betas4 = np.fromfile('betas.dat', dtype=np.dtype('d'))
degrees4 = np.fromfile('affinity_rowsums.dat', dtype=np.dtype('d'))
meandist4 = np.fromfile('mean_dists.dat', dtype=np.dtype('d'))
plot2D(Y_fitsne_scale4, label=color4, s=15, figsize=(7, 7),
       col_map=["indianred", "teal"], alpha=0.5)
plt.axis('equal')
leg = _opaque_legend()

# Two normalisations of betas4 (n = number of points):
#   y = n * betas / sum(betas)               -> mean 1, sums to n
#   z = 0.1 + 0.9 * n * betas / sum(betas)   -> floor of 0.1, also sums to n
z = 0.1 + betas4.shape[0] * (1 - 0.1) * betas4 / np.sum(betas4)
y = betas4.shape[0] * betas4 / np.sum(betas4)
print(np.min(betas4))
print(np.max(betas4))
print(np.min(y))
print(np.max(y))
print(np.min(z))
print(np.max(z))
np.sum(z)

# Colour the scaled embedding by the per-point diagnostics.
plot2D(Y_fitsne_scale4, label=betas4, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale4, label=degrees4, s=15, figsize=(8, 7))
plt.axis('equal')
plot2D(Y_fitsne_scale4, label=meandist4, s=15, figsize=(8, 7))
plt.axis('equal')